This project was designed to evaluate a Differential Gene Expression Analysis (DGA) workflow following the approach published in *Empirical assessment of analysis workflows for differential expression analysis of human samples using RNA-Seq* by Williams et al.
We tested five workflows for the DGA. All workflows used the same aligner (STAR) and quantification tool (TPMCalculator) but different DGA software: edgeR, DESeq2, SAMSeq, and the union and the intersection of all genes identified by all DGA tools. Recall and precision values were calculated using the same approach described in the paper, using the expressions:
$Recall = \frac{|G_{r} \cap G_{d}|}{|G_{r}|}$
$Precision = \frac{|G_{r} \cap G_{d}|}{|G_{d}|}$
where $G_r$ is the set of genes in the reference and $G_d$ is the set of genes identified by the workflow.
We used the paper's figure 5a to plot our results for comparison with the published results. Although we obtained recall values under the fitted line, our precision is above the fitted line, showing better results than those published in the paper; see the final plots here.
This analysis tests the whole STAR-TPMCalculator-DGA workflow. In our case, we use the same tools as in the paper for alignment (STAR) and DGA (edgeR, DESeq2 and SAMSeq). The only difference is the quantification step, which uses TPMCalculator. Additionally, we used their scripts and parameters for the DGA tools, all of which are R packages.
We see an increase in precision despite using the same first and last steps published in the paper for the STAR-quantification-DGA based workflows. We conclude that the increase in precision is due to the introduction of the TPMCalculator tool.
Our workflow is based on a set of Jupyter Notebooks and CWL workflows. The workflows executed the analysis using the following tools:
# Bootstrap the notebook environment. init.py presumably provides the names
# used without import throughout this notebook (os, pandas, plt, np, sns,
# display, Markdown, HTML, HIDE_CODE, parse_fastqc_zip) — confirm in
# ../config/init.py.
%run ../config/init.py
import gzip
import base64
from io import StringIO
from wand.image import Image as WImage
# Render the "hide code" toggle snippet supplied by init.py.
HTML(HIDE_CODE)
# Link to the "01 - Sample Retrieval" notebook and display the sample sheet.
os.chdir(os.environ['NOTEBOOKS'])
name = '01 - Sample Retrieval'
str_msg = '<a href="{}.html" target="_blank">{}</a>\n'.format(name.replace(' ', '%20'), name)
display(Markdown(str_msg))
samples_tsv = os.path.join(os.environ['DATA'], os.environ['DATASET'], 'samples.tsv')
df = pandas.read_csv(samples_tsv, sep='\t')
display(df)
del str_msg
# Discover sample names: every *.fastq.gz under the dataset directory,
# with the extension stripped, in sorted order.
data_dir = os.path.join(os.environ['DATA'], os.environ['DATASET'])
os.chdir(data_dir)
samples = sorted(
    fname.replace('.fastq.gz', '')
    for _, _, files in os.walk('./')
    for fname in files
    if fname.endswith('.fastq.gz')
)
os.chdir(os.environ['NOTEBOOKS'])
# Per-sample metrics cache filled in by the sections below.
samples_data = {}
# Notebook 02: build a Markdown table of raw-fastq FastQC results and cache
# the parsed metrics in samples_data[sample]['fastqc'] for later sections.
name = '02 - Pre-processing Quality Control'
str_msg = '<a href="{}.html" target="_blank">{}</a>\n'.format(name.replace(' ', '%20'), name)
str_msg += '#### FastQC report\n'
str_msg += '| Sample | Fastq | FastQC<br>Report | No of Reads<br>in fastq | Seq<br> Len | %GC | Poor<br>Quality | Fail<br>Tests |\n'
str_msg += '| --- | --- | --- |--- | --- | --- | --- | --- |\n'
for s in samples:
    str_msg += '| ' + s
    str_msg += '| '
    # Raw fastq: link with its size in MB, or a placeholder cell.
    fastq_path = os.path.relpath(os.path.join(os.environ['DATA'], os.environ['DATASET'], s + '.fastq.gz'))
    if os.path.exists(fastq_path):
        size_mb = os.path.getsize(fastq_path) / (1024.0 * 1024.0)
        str_msg += ' <a href="{}" target="_blank">{:.2f}MB</a> |'.format(fastq_path.replace(' ', '%20'), size_mb)
    else:
        str_msg += ' --- |'
    # FastQC HTML report: link with its size in KB.
    html_path = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'fastqc', s + '_fastqc.html'))
    if os.path.exists(html_path):
        size_kb = os.path.getsize(html_path) / 1024.0
        str_msg += ' <a href="{}" target="_blank">{:.2f}KB</a> |'.format(html_path.replace(' ', '%20'), size_kb)
    else:
        str_msg += ' --- |'
    # FastQC zip: parse the metrics and remember them for later notebooks.
    zip_path = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'fastqc', s + '_fastqc.zip'))
    if os.path.exists(zip_path):
        tests, tot_seq, poor_quality, seq_len, gc_content = parse_fastqc_zip(s, zip_path)
        samples_data[s] = {
            'fastqc': {
                'tests': tests,
                'tot_seq': tot_seq,
                'poor_quality': poor_quality,
                'seq_len': seq_len,
                'gc_content': gc_content
            }
        }
        str_msg += '{:,}|'.format(tot_seq)
        str_msg += seq_len + '|'
        str_msg += gc_content + '|'
        str_msg += str(poor_quality) + '|'
        failed = [t for t in tests if tests[t] == 'FAIL']
        str_msg += '<br>'.join(failed) + '|\n'
    else:
        str_msg += ' --- | --- | --- | --- | --- |\n'
display(Markdown(str_msg))
del str_msg
# Notebook 03: Trimmomatic results. Builds a Markdown table with the trimmed
# fastq, its FastQC report and the post-trim metrics; the "Removed Reads"
# column is the raw read count minus the post-trim read count.
os.chdir(os.environ['NOTEBOOKS'])
name = '03 - Samples trimming'
str_msg = '<a href="' + name.replace(' ', '%20') + '.html" target="_blank">' + name + '</a>\n\n'
str_msg += '| Sample | Trimmed<br>Fastq | FastQC<br>Report | No of Reads<br>in fastq | Removed<br>Reads | Seq<br> Len | %GC | Poor<br>Quality | Fail<br>Tests |\n'
str_msg += '| --- | --- | --- | --- | --- | --- | --- | --- | --- |\n'
for s in samples:
    str_msg += '| ' + s
    str_msg += '| '
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'trimmomatic', s + '_tr.fastq.gz'))
    if os.path.exists(f):
        size = os.path.getsize(f)/(1024.0*1024.0)
        str_msg += ' <a href="'
        str_msg += f.replace(' ', '%20')
        str_msg += '" target="_blank">'
        str_msg += "{0:.2f}".format(size)
        str_msg += 'MB</a> |'
    else:
        str_msg += ' --- |'
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'trimmomatic', 'fastqc', s + '_tr_fastqc.html'))
    if os.path.exists(f):
        size = os.path.getsize(f)/(1024.0)
        str_msg += ' <a href="'
        str_msg += f.replace(' ', '%20')
        str_msg += '" target="_blank">'
        str_msg += "{0:.2f}".format(size)
        str_msg += 'KB</a> |'
    else:
        str_msg += ' --- |'
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'trimmomatic', 'fastqc', s + '_tr_fastqc.zip'))
    if os.path.exists(f):
        tests, tot_seq, poor_quality, seq_len, gc_content = parse_fastqc_zip(s + '_tr', f)
        str_msg += "{:,}".format(tot_seq) + '|'
        # BUGFIX: the raw FastQC entry may be absent when notebook 02 found no
        # zip for this sample; the original samples_data[s]['fastqc'] lookup
        # raised KeyError in that case. Fall back to a placeholder cell.
        raw_fastqc = samples_data.get(s, {}).get('fastqc')
        if raw_fastqc is not None:
            str_msg += "{:,}".format(raw_fastqc['tot_seq'] - tot_seq) + '|'
        else:
            str_msg += ' --- |'
        str_msg += seq_len + '|'
        str_msg += gc_content + '|'
        str_msg += str(poor_quality) + '|'
        fail_tests = ''
        for t in tests:
            if tests[t] == 'FAIL':
                if fail_tests:
                    fail_tests += '<br>'
                fail_tests += t
        str_msg += fail_tests + '|\n'
        # BUGFIX: setdefault keeps the assignment safe when the sample was not
        # registered in samples_data by the FastQC step.
        samples_data.setdefault(s, {})['trimmed'] = {
            'tests': tests,
            'tot_seq': tot_seq,
            'poor_quality': poor_quality,
            'seq_len': seq_len,
            'gc_content': gc_content
        }
    else:
        str_msg += ' --- | --- | --- | --- | --- | --- |\n'
display(Markdown(str_msg))
del str_msg
# Notebook 04: STAR alignment summary. For each sample link the sorted BAM,
# its index and the STAR Log.final.out file, and report the key alignment
# metrics parsed from that log.
os.chdir(os.environ['NOTEBOOKS'])
name = '04 - Quantification'
str_msg = '<a href="' + name.replace(' ', '%20') + '.html" target="_blank">' + name + '</a>\n'
str_msg += '### Reference genome\n**Gencode grch37, release-19**\n\n'
str_msg += '| Sample | Sorted<br>BAM | BAM<br>Index | STATS'
str_msg += '| Number<br>of input<br>reads | Uniquely<br>mapped<br>reads % '
str_msg += '| % of reads<br>mapped to<br>multiple<br>loci '
str_msg += '| Number of<br>splices:<br>Annotated<br>(sjdb) '
str_msg += '| Mismatch<br>rate per<br>base, % '
str_msg += '|\n| --- | --- | --- | --- '
str_msg += '| --- | --- '
str_msg += '| --- '
str_msg += '| --- '
str_msg += '| --- '
str_msg += '|\n'
for s in samples:
    str_msg += '| ' + s
    str_msg += '| '
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification', s + '_tr_sorted.bam'))
    if os.path.exists(f):
        size = os.path.getsize(f)/(1024.0*1024.0)
        str_msg += ' <a href="'
        str_msg += f.replace(' ', '%20')
        str_msg += '" target="_blank">'
        str_msg += "{0:.2f}".format(size)
        str_msg += 'MB</a> |'
    else:
        str_msg += ' --- |'
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification', s + '_tr_sorted.bam.bai'))
    if os.path.exists(f):
        size = os.path.getsize(f)/(1024.0*1024.0)
        str_msg += ' <a href="'
        str_msg += f.replace(' ', '%20')
        str_msg += '" target="_blank">'
        str_msg += "{0:.2f}".format(size)
        str_msg += 'MB</a> |'
    else:
        str_msg += ' --- |'
    f = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification', s + '_trLog.final.out'))
    if os.path.exists(f):
        size = os.path.getsize(f)/(1024.0)
        str_msg += ' <a href="'
        str_msg += f.replace(' ', '%20')
        str_msg += '" target="_blank">'
        str_msg += "{0:.2f}".format(size)
        str_msg += 'KB</a> |'
        # Log.final.out lines look like "<label> |\t<value>"; strip the
        # " |" marker and split on the tab to get label/value pairs.
        stats = {}
        with open(f) as fin:
            for line in fin:
                fields = line.strip().replace(' |', '').split('\t')
                if len(fields) == 2:
                    stats[fields[0]] = fields[1]
        n_unique = int(stats['Uniquely mapped reads number'])
        n_multiple = int(stats['Number of reads mapped to multiple loci'])
        n_input = int(stats['Number of input reads'])
        # BUGFIX: setdefault avoids a KeyError when the sample was not
        # registered in samples_data by the FastQC step of notebook 02.
        samples_data.setdefault(s, {})['alignment'] = {
            'unique': n_unique,
            'multiple': n_multiple,
            'unmapped': n_input - (n_unique + n_multiple)
        }
        str_msg += "{:,}".format(n_input) + ' |'
        str_msg += "{0:.2f}".format(float(stats['Uniquely mapped reads %'].replace('%',''))) + ' |'
        str_msg += "{0:.2f}".format(float(stats['% of reads mapped to multiple loci'].replace('%',''))) + ' |'
        str_msg += "{:,}".format(int(stats['Number of splices: Annotated (sjdb)'])) + ' |'
        str_msg += "{0:.2f}".format(float(stats['Mismatch rate per base, %'].replace('%',''))) + ' |'
        str_msg += '\n'
    else:
        # BUGFIX: six columns remain after the BAM index (STATS + five
        # metrics); the original emitted only four '---' cells, producing a
        # malformed Markdown row.
        str_msg += ' --- | --- | --- | --- | --- | --- |\n'
display(Markdown(str_msg))
del str_msg
# Stacked bar chart of read fate per sample: reads removed by trimming,
# unmapped, multi-mapped and uniquely mapped reads.
plt.figure(figsize=(10, 6))
N = 0
unique = []
multiple = []
unmapped = []
trimmed = []
for s in samples:
    # BUGFIX: the original checked only for 'alignment' and then indexed
    # ['fastqc'] and ['trimmed'] unconditionally, raising KeyError whenever
    # an upstream FastQC zip was missing; require all three sections.
    sdata = samples_data.get(s, {})
    if all(k in sdata for k in ('alignment', 'fastqc', 'trimmed')):
        N += 1
        unique.append(sdata['alignment']['unique'])
        multiple.append(sdata['alignment']['multiple'])
        unmapped.append(sdata['alignment']['unmapped'])
        trimmed.append(sdata['fastqc']['tot_seq'] - sdata['trimmed']['tot_seq'])
ind = np.arange(N)
width = 0.45
# Bars are stacked bottom-up: trimmed, unmapped, multiple, unique.
p4 = plt.bar(ind, trimmed, width, color='#2eea15')
p3 = plt.bar(ind, unmapped, width, bottom=np.array(trimmed), color='#ea1563')
p2 = plt.bar(ind, multiple, width, bottom=np.array(trimmed) + np.array(unmapped), color='#ff7311')
p1 = plt.bar(ind, unique, width, bottom=np.array(trimmed) + np.array(unmapped) + np.array(multiple), color='#1c6cab')
plt.ylabel('No. of Reads')
# NOTE(review): xticks labels every sample even if some were skipped above;
# this mirrors the original behavior — confirm all samples have full data.
plt.xticks(ind, samples, rotation='vertical')
plt.legend((p1[0], p2[0], p3[0], p4[0]), ('Unique Mapped', 'Multiple Mapped', 'Unmapped', 'Trimmed'))
plt.show()
plt.close()
# RSeQC bam_stat summary table, one row per sample. The stats file contains
# "label: value" lines which are parsed into a dict and then emitted in the
# fixed column order below.
str_msg = '### RSeQC BAM Stats\n\n'
str_msg += '| Sample | Total records | QC failed | Optical/PCR duplicate '
str_msg += '| mapq < mapq_cut (non-unique) | mapq >= mapq_cut (unique) '
str_msg += '| Reads map to \'+\' '
str_msg += '| Reads map to \'-\' '
str_msg += '| Non-splice reads '
str_msg += '| Splice reads '
str_msg += '|\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- '
str_msg += '|\n'
bam_stat_keys = (
    'Total records',
    'QC failed',
    'Optical/PCR duplicate',
    'mapq < mapq_cut (non-unique)',
    'mapq >= mapq_cut (unique)',
    "Reads map to '+'",
    "Reads map to '-'",
    'Non-splice reads',
    'Splice reads',
)
for s in samples:
    str_msg += '| ' + s
    str_msg += '| '
    stat_file = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification', s + '_tr_sorted_rseqc.bam_stat.txt'))
    if os.path.exists(stat_file):
        stats = {}
        with open(stat_file) as fin:
            for line in fin:
                parts = line.strip().split(':')
                if len(parts) == 2:
                    stats[parts[0]] = parts[1].strip()
        for key in bam_stat_keys:
            str_msg += '{:,} |'.format(int(stats[key]))
    else:
        str_msg += ' | --- | --- | --- | --- | --- | --- | --- | --- | --- '
    str_msg += '\n'
display(Markdown(str_msg))
del str_msg
# RSeQC QC plots: embed each PDF as an inline base64 PNG thumbnail that
# links to the original PDF. The four plot kinds share identical handling,
# so they are driven by a tuple of filename suffixes (column order matches
# the table header).
img_size = 250
str_msg = '### RSeQC PDF plots\n\n'
str_msg += 'Click on figure to retrieve original PDF file\n\n'
str_msg += '| Sample | Boxplot | HeatMap | Splice Events | Junction Saturation '
str_msg += '|\n| --- | --- | --- | --- | --- '
str_msg += '|\n'
pdf_suffixes = (
    '_tr_sorted_rseqc.qual.boxplot.pdf',
    '_tr_sorted_rseqc.qual.heatmap.pdf',
    '_tr_sorted_rseqc.splice_events.pdf',
    '_tr_sorted_rseqc.junctionSaturation_plot.pdf',
)
for s in samples:
    str_msg += '| ' + s
    str_msg += '| '
    for suffix in pdf_suffixes:
        pdf = os.path.relpath(os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification', s + suffix))
        if os.path.exists(pdf):
            with WImage(filename=pdf) as img:
                img.resize(img_size, img_size)
                img = img.convert('png')
                str_msg += ' <a href="'
                str_msg += pdf.replace(' ', '%20')
                str_msg += '" target="_blank">'
                str_msg += '<img src="data:image/png;base64,' + base64.b64encode(img.make_blob()).decode('utf-8') + '">'
                str_msg += '</a> |'
        else:
            str_msg += ' --- |'
    str_msg += '\n'
display(Markdown(str_msg))
del str_msg
# Notebook 04 outputs: link the results folder and load the TPMCalculator
# matrices (ExonTPM / ExonReads) into pandas data frames keyed by name.
os.chdir(os.environ['NOTEBOOKS'])
name = '04 - Quantification'
str_msg = '<a href="{}.html" target="_blank">{}</a>\n\n'.format(name.replace(' ', '%20'), name)
str_msg += '<a href="../results/{}/" target="_blank">Results folder</a>\n\n'.format(os.environ['DATASET'])
columns = ['ExonTPM', 'ExonReads']
data = {}
quant_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification')
# Re-derive sample names from the aligned BAM files actually produced.
samples = [
    bam.replace('_tr_sorted.bam', '')
    for _, _, files in os.walk(quant_dir)
    for bam in files
    if bam.endswith('_sorted.bam')
]
for c in columns:
    matrix = os.path.relpath(os.path.join(quant_dir, c + '.tsv'))
    if os.path.exists(matrix):
        str_msg += '[{}.tsv]({})\n'.format(c, matrix)
        data[c] = pandas.read_csv(matrix, sep='\t')
display(Markdown(str_msg))
del str_msg
# Per-sample distributions of exon TPM and exon read counts as horizontal
# seaborn boxplots, built from long-format (value, sample) pairs.
display(Markdown("### Exon TPM and reads distribution per sample"))
plt.figure(figsize=(10, 8))
tpm_rows = [[v, s] for s in samples if s in data['ExonTPM'] for v in data['ExonTPM'][s]]
tpm_frame = pandas.DataFrame(tpm_rows, columns=['TPM', 'Sample'])
ax = sns.boxplot(y='Sample', x='TPM', data=tpm_frame, orient="h", palette="Set2")
ax.set_title('Exon TPM')
plt.figure(figsize=(10, 8))
read_rows = [[v, s] for s in samples if s in data['ExonReads'] for v in data['ExonReads'][s]]
read_frame = pandas.DataFrame(read_rows, columns=['Reads', 'Sample'])
ax = sns.boxplot(y='Sample', x='Reads', data=read_frame, orient="h", palette="Set2")
ax.set_title('Exon reads')
# Differential-expression benchmark: for each metric, show the table and
# plot it as a boxplot and a horizontal bar chart labeled by reference set.
# NOTE(review): the .csv files are read with sep='\t' — presumably they are
# tab-delimited despite the extension; confirm against the files on disk.
de_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'differential_expression')
for metric in ('Recall', 'Precision'):
    table = os.path.relpath(os.path.join(de_dir, metric.lower() + '.csv'))
    df = pandas.read_csv(table, sep='\t')
    display(df)
    df.plot.box(title=metric)
    df.plot.barh(figsize=(12, 10), title=metric).set_yticklabels(df['Reference'], rotation=0);
We used figure 5a to plot our workflow's recall and precision results as described in the paper. Our workflow shows recall and precision values similar to the analyzed workflows that use the same tools and a quantification tool similar to TPMCalculator.

